{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用户数据处理\n",
    "\n",
    "用户描述信息在u.user文件：共7维特征\n",
    "user id\n",
    "age：年龄\n",
    "gender：性别\n",
    "occupation：职业\n",
    "zip code：地点\n",
    "\n",
    "this is a tab separated list of\n",
    "user id | age | gender | occupation | zip code\n",
    "The user ids are the ones used in the u.data data set."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导入工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import scipy.sparse as ss\n",
    "import scipy.io as sio\n",
    "\n",
    "#保存数据\n",
    "import cPickle\n",
    "\n",
    "#特征编码\n",
    "from utils import FeatureEng\n",
    "#用户数目和电影数目\n",
    "from utils import n_Users, n_Movies\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读入数据"
   ]
  },
  {
   "attachments": {
    "ml-100k-user.jpg": {
     "image/jpeg": "iVBORw0KGgoAAAANSUhEUgAAA5kAAACNCAYAAAA9+b42AAAgAElEQVR4nO3dP0/j2tr38SuPpj1DlxSzaaIIaaCAbsgLGAtxGgqaIJBOiaIRSkFP6CkiiogeRKQRBRWKvJu7C9ORAkZCKM3sKRKdZjgvII/XcuzY+Wsnzh/C93OU+54J9vKK7b13fqxrLcf+85//NAUAAAAAgAh8eHx8nHUfAAAAAAAL4sO///3vWfcBAAAAALAgPvz3v/+ddR8AAAAAAAviw6w7AGC+7e7udr13c3Mzg54AAADgLZh5yPR+geWLa7TUue13Tjnv0Rh2HhfhPDv9HnQ/Tdp7OM/zpN+15jyPL8gvbTjP0XHOJffzZHA/Tw//HZwOznN0ZhoyO7/IzPJL7CLp9S/9zp9z3sc37DxynqPBeZ6ufv/+4DxHZ9B54zxHZ9gvWjnP0eg8j16c52jw38Hp4DxH6//NugOInrrhueknj3M8HZxnAGHx5W86OMcA+iFkAgA0vpgDGBX//gDgNfM5mcAieA//cZ315xs0twp4S5jTMx2cZywKdf9yP09e53nGeAiZwJjeQ8CcB/OwANEi47xOR69FUTjvk8Hcqenh/E4WcwGnZ9AcY4RDueyIuPGmY97P86L8i37ez/OimPfzrPrn9HHe+zrIW+77W8J5ng7O83RwnqeD8zwd83CeCZnAiBYlYM67efgX5XvgLBjm3NPc25PB/YxFw38LsYi4r8c305Dp1D47r2HLug8zzW2CmFWfO0cjOkcmBp13znO4bbznkfM8mW36nUfO82S26byPOc/RbsP9PJ1tOM/zsQ3nOZptuJ+nt82gc8h5DrfNzOdkOheM3xZEJ8i55HyPj/M8PcPOI+c5OoPOJec5GtzP08F5ng7O83RwnqeD8xwdymUBAAAAAJEhZAIAAAAAIhP7v//7v+asOwEAAAAAWAwfZlFb/P3796kfEwAAAAAweZTLAgAAAAAi8yZDZnP9RCr1ipysT77SN6pjTbPPGIxrAQAAAEzOmwyZAAAAAID5RMh8R5rNdTmp1KV+mYmszfWTitTr47cZVTsAAAAAZmtCIXNJ1oxd2d1Ny3LIPaMIG5nLut1G63WZ6V8W2cxctra7lEwzePmkXXJp7Vc5kXXPfs6xBx1zUHth95sXtZenuWpnHO496Lw6rnH7nul9nzWbGbns8fN+92OQe54QDgAAgLci+pC5nLbC5ReR379D7eaErAs5l2Jt9MOrkFeQnCQSCf2K50wxCo2e4U2N7OWPjNEPpiS3ZGfDaS8j22M299ZUT9P6PKdPq3PRzrhUmDOzL5KLx+37J54TM5kV82qvY0vT3cZ5HZRi+iexWEkOPO9770UrRouTo4Pc81H9cwEAAABMS8Qhc1nSVr78cWPK42vwvVQ4uzK3pGzEZTM/3khW6cD6Qn9Qar9xfaa/nBvbnSFBZCN/IdmkFRb0l/+QVlOSrBWlaCYltdp6b29bUsUcYeCNUr902NlKWvnxTkqxdmA8s2+gUCPdvdrWv9Awz+W0Ggt0z0f5zwUAAAAwLRGHzF9SualY/zcce+Qnrb98T4v6An+cTUqteCbXY7Rze2e6AXZvOyXlWzsMpFY2Au3vlEE2zKxY8UaPurZLK/0lvF1lmn1KJ/3lwr1XUfWVhHrKQZ15m5WT9b7bhOnLsP4EbaerhNWzXdA+DxOLVeX5RdTF8+23mkqqOl4ZK+btHUs2WZPi2XXrWMPv+Vn8cwEAAACM6x0s/LMqdkbwR4S9q4IYtaIc5h/Ga/76Tkw1yrV+IkepstyGbM4pE40bRVEDoGbOW4J54I6oqTDWKIi/jNModIQtey5gIVUUw93uUGSnYxTX2s/cKuttnHLQi7w/FCezppipc1/JqHebWOmg9TOjf6lngP4Easf67Bdy2C47bX12FSrD9DmI6zPrOqjy2Pu8DpoqtBYMKxwe5qUa84Y9QwqNRt95lv7z0BrFrIW/PwAAAIC3ZuFDpg6TVnwre77dq3luduViZ3AIbmMl1frTtZwVU7J9bP29fCuTyBBOSFGjroPKODfyR9ZnNSW32f5canTu9LTkb9AK10ZrG9XOnWkFNLfm17PNfnvUrec2QwTuzxAqiHrnak6yz7HqqWyqwCtW0LRCpD0/c9M3mugEY+dlWNdBjUB3hl7Xxo7oKtwx7jcAAADgrVjokKkXAVIDSMVDNyTowHaRlaSZcxdqGdfDbVlSRkoHWafkMmy4GcwejVUjdd6SUTOb9G+V8s8n7OvleXjYCbLNsF4H7U8AnSsGF3otsBRBn9UvIO4bpmyVDR0gc6Y9Ytk3QFoe8od6FDa5tdOzPHfvOKvn756NU5cNAAAAvBELGzLtMkcVMA3/iqUdc+OioEa/0lOYO+cvpfWX1Krw7A6uzoEo+2P/ssC/musoazUN4/0FhHPPqIWk1LGS2eO+C/+4czl7tRnBqDkAAADwlixkyFRz+PQonycsOPb0M0aSkjXb8+kaeljMmWMX7nmZ0/EkL31WyHW4QWfMVVCjElV/3MfCRDQiOljv+btP6uRLSvqt5eT2scdIqh7FFFPuGMUEAADAOxFxyFySNWNXdnet15dP1t8/yRf1511D1paiPVI/9gI5hg6YvkeZtOhHnPR8fqEzUnYQKMzoUtAIyjNdD89iZ7LuIKkC2225phfsGbTAjF60RoVlzzMd9cqrJ/1Xfp2kaPpjB2zviq9OGXT07GN5Ry2dVYgHLdpjz/u17p99f5J0RjG9c2kBAACARfch2ub+yKN5I48j7NkVHJIFqdcLoh5eXzQ2A5Wiuqt4Koazf5sqN41qHuZQrVAUNISqhWr2jRW5N739VsF3XwcUtQqtIRUx1SNOvB/LE6btRWue5arhbcM+f9ZPo/pkmipH9s0J7XG9gvRneDtVyR8WZctUC/Fk7RaKhhgvF2JGXB6swnx+07p/7k09qu2eZuccx/r0Wf080f65wxnFzKkVjHvcB0Hu+Sj+uQAAAACmKfbt27ep11Z+//59rP314iz6IfWT/5Id1bGm2WcMxrUAAAAAJmch52QCAAAAAGaDkAkAAAAAiMybLJcFAAAAAMwnRjIBAAAAAJEhZAIAAAAAIvMmQ6ZaHbRSr8jJ+uQrfaM61jT7PE2L+rkAAAAAjOZNhkwAAAAAwHwiZC6IZnNdTip1qV9mZt0VAAAAAO/YBELmsqR3d2XXeaWXA++ZubRCUr39usyEL8FsZi59bfRqq3ub0co9+x1rUkHPLk0d7bzMyvpJJfA1bZ/PS8k0m31+Zr8qJ+uh24nqugMAAADoL+KQuSRrxheRHzdyc6NeP+T3py9irC0N3VMFzILkJJFI6Fc8Z4pRaIwYqEzJxeNuW+p1UIq5P42VDnw/M4oiWfN+xMDRfazEQWmEdhaPuqZm9sU9P/Y17X2e1Uhs/sjo2Y4Kh42CIWau1Y59wXoGzUHtRHvdAQAAAPQSccj8I4/mjVR+OX//Jf/8Fvn46S8ZFjNLBx3h7PpMijURY3sv2i728JA/t6JiUlKrEz/Uu9FsZmTbynpmbl9KsVbA19c0Kdnj7mu6kb+QbNIK7FYQ9bfTCo1mzv1FQax6KufWZsmtHVnvGPHs104vXHcAAAAgeszJVDZWJDXjLnSWlXpLbp2fNcysFYlEj/C2t+0uLfW1VTnpCmJdZaMd5b3653q/jFyOWr6sz2lNXp7ab8ViVbkt10RSK74+qUB6nE1KrXgm113t7MiW9aHNu2vf9irAWilTdjYkWDt9+wgAAAAgShMOmcvy1yeR19//yJ/Q+65KygoXNW9KCcyQQqMROBztHVvhrVaUs0DJJHoqFPrKSuM5MY2CG/6qp2m3TNSKaG7ZqP06aI8UKtZ+5lZZDKst3U4yKxf5dhKzS0+l77Fc1n6mvWGrtLRmNX3VFWgH6zNKmEyJ9+29q4IY1vk/zD90b7uaslpph1U1L/Ve9auozoW//YHt9DDr6w4AAAAsogmGTHt+5if5LT8fw0dMHRisGFG+DRYYHN3z7mp65K9z/p53NK9g1KR4mJeqN6wF5g+0wxal6eSMvnnLSmOxkuyrck/jKPx8QSs0GZv2Z1Ht3Kmy0lYSc0pP1Uif91hndl1yV4BUYdYpUX24LVtXIyUrGxLMw62oQUvjKO+OWqqAeJFN+j+/9Z6uhj0ffv51wDS3pGzEZf+2+2dB2onuugMAAADoZWIhczn9VT5/fJWff1fk1/DNffQiQIbKS4dyWh0vADzkD/Xczs75e94wGjfKsmV2B9Fguhf+SZ9Wg+++t63DdNeA7dNL10hdIC/PA0KTPTqczJq+UGx2BD+bKZ4KVT0PMp1IB74eqjQ2v2mPpJqtEN64EDlXF6P2Iurj6tB7kZWkZ75lXztX0tABc7OrD2Haie66AwAAAOhlIiFzac2QL59UwDQl7CCmKh21A6YRLqz1ocLO88uQbQYsJLOI/OW2fcpuI6BGSQ+8x0ifik65ThDeO5ZssibFQfWqrbCdzarKXU/A9JbRBmmnV//e2XUHAAAApiHykKkC5tfPMlLAVKWMelTNzEUSMHWbziIxA0f4ZqjfiKUOUf7RxAgOJi9TWrG3F3fF2daH2rNX75Gs2S43Vo8qaZcgX8peq+zW2skXgjdWUiK1sqhq6iDthJtLCgAAAGBUEYfMZVn9/NH6/x/l89dd2d1tv9LLg/d0noWoAmaUz5m053aaktvvn9bUsQsB5wVGzRlN8y6qo8LYVcE/d1J7eBY1KDtqSHRXdzUKIz5/dHTOgj2pouGWtOrH1nSMqMb1o0ecEmQ1ulqVvH2C3D47czud6xWsne7rOsvrDgAAACyqD9E290sqN2FnYHqehaiolU7rBd/PvQvQDGOv1OqZY6hCa6KkEpb7ljPn07ORDiNRl4sGpUKSqD41GuJ88l7lwnpBIGNF7k3vOVJ93w/2yA6xV6o1xDpH6jEo3tMccbhX/NeiJkUjPtIcWzWPMi7qlxDtPoe5Jxzzdt0BAACARRT79u3b1OsIv3//Ptb+7VVGuxeBiVpUx5pmn6dpUT8XAAAAgNFM+DmZAAAAAID3hJAJAAAAAIjMmyyXBQAAAADMJ0YyAQAAAACRIWQCAAAAACLzJkOmWtG0Uq/IyfrkK32jOtY0+zxv3vNnBwAAAN6bNxkyAQAAAADziZC5gNZPKlKv16V+mZl1VwAAAAC8M5GHzKU1Q3Z3d9uv9HKg/ZqZSzsYua/Ryyszl3VfW5cZfztRHMsuAbX2rZzIerO9r3PszmPOQu3lqef7Tt/noY9Br8Wwa9qrrcrJeqBjOm01mxm5rNc7+jP4mAAAAAD8PkTd4J9HU24enb8tS3r3i6SXf0nl1+D9YqUDSZTaf1ejcaZ5L2Jsymk1Fvj4KowUJCeJVmMqUDQKDbmUuByUYpEeS0tuyc5GXqpVO6RsG+F2n4TqaVoSp7PuRTBBrkWQa2q/Z4iZs99TQfreNKUihqRPq277ui3DlFw8LqWYc61b90WsJAfezrTYbaekT2YHAAAA4DHhctlXeX0V+dfHpdB7PuTPxZSkpFbD7Vc6SEjiwBMUrs+kWLMyy/ZetMdaTUmyVpSi6dlvb1tSxZw+HkbT61oMu6bN5rrkj6x0b+bav0ionsq5qX4HsOOONKuwaAfMfU/AHKzd9nn4X0AAAAAA79BkQ+bSX/Lp46v8/udP+H03ViQVfY8iP9btnemGnb3tlJRv7eGu1MpGoP11GWifuZN6bmWrHFeXeOo/+0s6vSWcXaWnHe06czUbZtaKcVZIKzQ8219Kphm8rSD9Gcko12JjR7asD2TeXbf754wq65Fm+709641a8SxwwLR3OpZssibFs+vh2wIAAACYTMhcTrfmY379LPLzhzyOkDH3jrN6pHD87/arkkr2n5849rGu78Q0tiWzfiJHqbLcPozQRmrFN6/Tsao6/vIsVScUJbNiNgoiubgkEgkxijUrKF654VCXnlrvx+NGz9FUXUarfm4URf3YbLVjvw7c8GWXh4ouKbXby1mfsdAdhof0ZxTBrkXHNVWjytYncv6qS2VVv4rqc9qjok7ofHle9c+77JhT6+WOYtZGvK4AAADAOzSRkPmrciM3N+r1t/z+9HWkxX8KRk2Kh/l2wBrR3lVBDCtqlDtSwrjH2lhxxtuu5ayYku1j6+/lWwmbRZ5e2mnQ+zxJFXDUITrDsTPnUHm4LVufLCUBB00DcYKVd8RPzVU8s+tTuwJkFP0Jey36XVPdlp6LuSVlIy77t937GoVtuXPDsxXGxQrKV31KqZ0R0vPx70MAAADgvZjwnMw/8vjzt8invyRIzHRG4uzRtrJsmY2+K4QGYS/yYgW14mHXfLooj6XCVcpI6dATi1Xl+UUkGXCC54O9saitN3a2JGn9b2unX0ozxVMRqucdphPpiOcK2qOEyazpK5c1s8mJ9SfMtRh0TWXnSho6YPZfwMnM7XvCc1XyauKmcdRzRdvoRtMBAACA92M6z8l8fZXXkLv0WrglDDX/0A4j/tVFJ3GsscLe04s7+reaEinmimKlTNlohb2X59nUafpLabtLaidl0LXoe031OUxKNqsqdz0Bs6OMVgk6V1aNiNrr/TCKCQAAAIQx4ZC5LOkvn+T19z8ywrTMkanySz3yZuaGBsyZe3iWF/2HPdlWczqvb6UszmI1tRk8NuNJXoasxjsLA6/pg3XO9CTTO18I1iXN7nxK+3N1jjDbZc8v0pnl9Shmx0gtAAAAgOEiDplLsma0Fv3Rry8iP27EHGHlH/txE+FHkpznJaow4nvsRcTH6lqUZ2Qq/CRl6+JIUuVbq72q3JZFsurvPcJPJFrBtleQjOnjq5RZGH+l2BH0uhbDrmm77LXdZzUSeWGFUqedXqWxajGgY2ubzhVnnVHM0CvRAgAAAJAP0Tb3Rx7NG3kcYU9nrl2bqVc3DfMl310NVFGrodYLvp87i9REcayhWivGDguhzhzOrNWfoh5yi9kL6GTVfMCyhBnIVOWkvrmTSecc1KTomaeoFvLZN1bk3vSeo/bzI9UqtIZYbalHnHhPYYjgHtSwaxH0mqp5nXFRYbTdZ++iRPpzW9sYK9bnMhuSbb3Xq5zaGcXM5R/UyYrw0wIAAACLL/bt27epD1d9//59rP3bK4j2X+AlKlEda5p9njfv+bMDAAAA7810Fv4BAAAAALwLhEwAAAAAQGTeZLksAAAAAGA+MZIJAAAAAIgMIRMAAAAAEJk3GTLVaqWVesV93uFbONY0+4zBuBYAAADA5LzJkAkAAAAAmE+EzHek2VyXk0pd6peZyNpcP6lIvT5+m1G1AwAAAGC2Jhgyl2TN2JXdXUPWlsLt2cxc2oGjfimZZviSxsxlvbW//brM9G9j1GPZJZfWfpUTWffs5xx70DEHtRd2v3lRe3maq3ai0O/eaL/vvLpLb7u36b4vhrXTbGbkskcbQe5rAAAAYFY+TKrhpbUv8llerf+Fo0bb8kfGyMdVIa8gOUkkSnZ71hf5RqEhlxKXg1Is0mNpyS3Z2chLtWqHgu0xm3trqqdpSZzOTztRGXRvxEoH0rq9NDUKa5r3IsamnFa995gpufi+lGLe99p/HtZOLFaSA+8GTt/0PZ2SOcrjAAAAgGtCI5nLsvpZ5OePn/K/kHtu5C8km7S+nOfMkY5cOkhI4sDzxfz6TIo163v79l60x1pNSbJWlKKZlNRq6729bUkVc/p4eNvC3BsP+XMrTnrugxEFaccNv+Z5R6AFAAAA5sNEQuZy+ot8+v1THv+E20+NBB5nk1Irnsn1JDo2gWPd3plugN3bTkn51h5eSq1sBNrfmYvYMLNWvLDCcKHhKYkcUqbZZ/6iv1y49yqq7hzIjpJfZ95m5WS97zZh+jKsP0Hb8fWlY7ugfQ4q9L2xsSKp0EcZsZ29Yyv81qR4Nul/QgAAAIDRRB8yl9bk86ff8qPyK/Sue1cFMWpFOcw/RNihVUklu+f6RXas6zsxjW3JrJ/IUaostyGb02WiiYTEjaKoAVAzF9d/t18HbqmlXSIpkovbP4/Hc9ZxCx1hy57DV0gVxXC3OxTZ6RjFtfYzt8p6G91OMisXeX8oTmZNMVPn7WN1bKNLPfXPjL4jt0H6E6gd67NfyKF7XpzPrkJlmD4HFfbe2DvO6lHt7txnSKHRCDyHsn87NncUsxb+PgMAAACmJeKQuSRrX1Sd7JOEjZhq4Ru7CjAv1Vh0ZYA6MFjxrez5Vh7FsTZWnDGnazkrpmT72Pp7+VYm8d3fCRdqZM0JnWq+3pldB+yOdm7kj6zPakpus/25YrGqnJ52zOuzgozR2ka1c2daAa2zRlNts3/tHqvnNkME7s8QKoimT6vtv0+wz0HvDe8IbMGoSfHQv70Tnp2XYV0rNUrdGYyHteOzsSNbyej/GQEAAACiFG3IXF6Vz/JTfoSsk9Uh6iIrSTPXtTjPOPQiQGrgp3jozl+bxLEebsuSMlI6yKoQ9fwSPtwMZo/GqpE6b8momU36t1IbmXcdC8308PI8PKQE2WZYr4P2J4DOFYMLvdbkGbPPYe4Nb4iMG2XZMrsDpNdD/lCP1Ca3dnwlvGHaGTbSCQAAAMyDSEPm8l+fRD5+lq+76tEl6vVFPslH+fzV+rOxJn2fZDKBeWZqbp4dMA3fKNgkjhWrnko6kZ74Qiz+Ulp/Sa0KSCuRTAyMRpT9sX9ZYLqlwuo14rpQg414b6jrf252B0jfNq1fPozazqRG+gEAAICoRfoIk1+Vm44y2WVJ736W17/NgYsA7bWe+5E1G5Lt+Jma01bo+SiI/lQJoh7lM3P+gDmBY03Hk7zUrD6rBYZKvUtN3RCjy2evZ97/qPrjPhYmohHRQSZ5b7Q/x2ijrXoU0zr+ucq/83RrAgAAAB0m9AiTcPRjRzpG6OJ6qMoZvToIFTAbagjTCpi+R5lEfCxdChpBSanr4VnsTNb9qBUV2G7LNb1gz6DFY67P1OJBhhSu2m3olVdP+q/8OknR9McO2JJacUf3nDLoqI16b6h7rjBklNGeG2y1s99/lLRfO84opndOLgAAADCvIh3J7PZLKjfhV5kdlbv6pqJWXq0XfD9X5aZRzvkcqBWKgoZQtVDNvrEi96a33+2RM7UKrSEVMdUjTrwfyxOmVbnlZvxZrhreNmpSNDYl6uEvVY7smxOadI5pH0+VDgfpz/B2qpI/LMqWmRWzYY8vqhJo4+VCzBmVB3eHXDuEegNg1+dS1ylRUhc6VDuKM4qZU6vdEjIBAAAw52Lfvn0L/yDBMX3//n2s/dXIzr25JeVWmJmkqI41zT5jMK4FAAAAMDlzUS4LAAAAAFgMhEwAAAAAQGTeZLksAAAAAGA+MZIJAAAAAIgMIRMAAAAAEJk3GTLV6qCVekVO1idf6RvVsabZ52la1M8FAAAAYDRvMmQCAAAAAOYTIXNBNJvrclKpS/0yM+uuAAAAAHjHIg+ZS2uG7O7u+l7G2tLQ/ZqZS6nX612vy0y4MszudrpLOYNsM9qxWq8JBT27NDX8OZm19ZNK3/MS9LpnLoPdF+GORZkvAAAAELUPE2n19w+5qfwaYUdTcvF9KcVinvdifbfuJVY6kESp/XcVOkzzXsTYlNNqLPA24/UZigrF92ZWpJiTYq0g2b5bDr7uKmAWJCeJ1kVTYbFRaMilxOWgFAt8rGivOwAAAIBeFr5c9iF/bkWYpKRWx9sG4TSbGbkyt6RsxGUz/zRWW6WDhCQOPOnw+swKklY23N4b61hcdwAAACB6Cx8yZWNFUlFsM2FumWePklvnZw0za0UiK1wVGp5tLyXTbPZvq3Ii680h5cIdpaX653q/jFyOWLoci5XkIJGeygjhyMeag+sOAAAALJrJhMxPX9pzMo01GT4j02FIodEYKdT0s3dsBbNaUc6ux9tmknTZZvZFcvG4JBIJicdzYhoFN/xVT9P2+0ZRatbfzZy9nf068JeZWvuZW2UxrLZ0O8msXOQ33B/bpabS91guaz/T3lBvZxRrVtNXXYE2GmGv+6qkrLRdexlvhHTW1x0AAABYRJGHzD+Pptzc3LReP+T3x8/yNUDQ1PPl3ODkhJqGVE7WQ/fBO1JXMGpSPMxLtWPOZJBtgvEHJPUK02dV6nmcTVrBsT0nUY3M7edMq+mj8AvTWKHJ2LQ/i2rnzmom2aoHVSvQ5o8Ma5Mz37HO7NrTrgCpwqwz5/HhtmwF3JSsbEikRrnue1cF66zXpHz7EPp40V13AAAAAL1MuFz2l1R+/Bb5+FE+htzzIX+o590lt3a6yj2H8QaXuFGWLbM7tATZJhjTHRV0XunTavDd97Z1YOoalHt6sd4dYb7gy/OA0GSPACazpi8Um9lkj21NufOM8MWqp5KeQvnrsOuuFwEyVJY+HKkv0V13AAAAAL1MfE7m0sd/jbRfLFaV55fxj6/C0bk5OKwG2WaR+Mtt+5Tdzsig667Kiu2AaYQL8v2O9c6uOwAAADANkw2ZS2vy5fNHef35JGEfaKLKSLetQDF4ZG4B9BuxXE1Z7/pHEyM4mLx4VmWdR/2uuypz1SOuZi6SgAkAAABgMiIPmcvp3faiP18/y/9+3Ij5+Cd0O/a8O1Ny++OlLBVO1OiXed5/7l2QbSbFGU3zLqqjH8lR8M+d1B6eRQ3yjRoS1SjhbbmmFweKYlGlSeh13e3FigwdMH2PMhnTLK87AAAAsKg+RN3gr8pN6FFLxV5h1TM3UAWKREklo1DtOHP2PA3pOZPesBZkm2lSz4EU1adGQwqt93qVhOoFgYwVuTcLUq87W6q+70vQKK5WqjXEOtfqMSgFzw8iDnBK13lOOv2uSdHY1HMqh113Z7EizfB+bmdze3GiIMeat+sOAAAALKLYt2/fpj6k9f3797H2b66fWEFrS8qt8DBJUR1rmn2epkX9XAAAAABGM/GFfwAAAAAA7wchEwAAAAAQmTdZLgsAAAAAmE+MZAIAAAAAIkPIBAAAAABE5k2GTLWiaaVekZP1yVf6Rq/DaSkAAAfjSURBVHWsafb5PeM8AwAAALP1JkMmAAAAAGA+ETIxcesnFanX61K/zMy6KwAAAAAmbEIhc0nWjF3Z3bVfxtpS4D2bmUs7kLRel5nRyx7bbV1KpulvJ3NZH+s4dlmmtW/lRNY9bTvtjtPvt8I5B0E/a+3lacI9Gs4NvAOue+c2nde4a7s+4bnzXq6crA/uEyEcAAAAC2ACIXNZ0rtf5dPvv+Xm5ka/zMc/gfZUAa1REMnF45JIJPTroBQbqRfN5rrkj4y+xylIzj1GPGeKUWiMFgyTW7Kz4RwzI9u9D/muVU/T+jynT6sz7Ye67mb2xb2/7Ot+75u/qQKfb5t4TsxkVsyrPXcbJ1xfyLkUa72PpQJmo2CImWu1YxRFsqYvaAZpBwAAAHhrIg+Zy+kvVsD8EThYOtSX8oJhWl/u96UUGy1Yem3kLySbtNqzgkSn0oEVLg9K7Teuz/SXfGN7r2vbvlZTkqwVpWgmJbXaem9vW1LFHIFhDjm/ADBznvtLX/ekZI/3Wtusy85W0trozt0mFivJmX1z6NFw1c6VuSVlIy6b+d4js+4vOMyc+0uSWPVUzk31O4kdPSoapB0AAADgLYo4ZC7Jx3+J/P7nV+g996wEUCueRRIw1Rf442xSt3c9dmuD3d6Zbjjd205J+dYODKmVjVDtdJZWeksn3dLcjrJfp8zSNzo2oB0vf7lwezVWFZBOKt376e1b7znHbZhZsSKZHgVut9Xu47C+2D/vXgnW+byjfK6+NlYkJTXxVuzGYlW5LdfUxdLBT/39+UXcvztWU0lV6ytPYofOg0RaTqsD7tONHbGzavvuc0e5WyPfgdoBAAAA3qCIQ+ZH+fjxVV5lTYxdZ06mIcOmZDpfwF+eV+UywFy4YfauCmLUinKYfwi4x6rYOWKEEaXrOzHVKJcVjI5SZbkNekgPu7RS/CWaRsENUmoUbNMoWhHJkKO8HV5VELvQQdpwy1CHtaO3sc61OseFVFEMd7tDkZ3go7hO+Wtc90ncklD7ddAeBSwdtNo3eo/uqnMn7ZFEx8bOlvWuKeet6xfkcwXjGXX2vZ0S5+3rM+szqfLY+7y+91SgLhg1KR7mpRr0FyBqlNsTaNW1ulcfoKjOV58+AAAAAAsi2pC59FH+ZQXNz59FfrTmY/79U+Tz17QsB9jdKGzLnRskrGAi/rlwQagv9LpS8Tx4KNCh1Pr6Xw6REDdWUq0/XctZMSXbx9bfy7cSNmM6pZXeUdzOEk39ngqaOSuSZS/kZF2VWmYlaebaATNgOxv5I+uzmpLbbJ8fNYJ3elrq7NrE9epfZ8lq0M811MOtqEFL4yjv/uLCCeq+Pqnz7Nx7jUZrfubmyCOOOmC2ymL3b0dqAgAAAHhTog2Zf17lf/IqP388ijMj88/jT/ktn+SvACnTO19OBZ+8msRmHHWVU/ajA8mFHb6CLhikFwGyQmmteDhykHi4LUvKSOmQ6pRcJgMPV9mjqMms6SsHNTvCj6JGBnNmUrJmwQ6K+9eh29Gln545h7Omzp0aoXWnw+pS05oUz5zPFvz8DKLvp83WIj4Nu7y3cSFyrsJqqxRWsUcdTdkqG/qXHTnTkIK1fb+VYQfauZKGDpijh1QAAADgrfkQfZOqZNb6f751f17l9XX4nvY8xjFWIN07lqwKKIcqoAz/Um+XQoqv5HQUavQrnTgNdMx+VMlpz2DsCYMqRLsDqCO04+7/MnI3I2cviJOVgkqZpZJdKltrlR17PkaQ8zP0WHoepH/ENnOZVXXaelTX+0uKROt+UItEifpFRPZYMvmAi1I9vejgnG2Ngrr7tMpoy6zzAwAAgAUW8ZzMX/L081U+fV4TZxqmXm329bf8M3Cx2Sd5qXWP/m20EtFzwBrUPXtlFcma7YVo1GMkROzRqM5FafRomKfkdDbszx5kZVt3xVw1J1F9pvu8Z87q8HbchW3ClJlKsHA7jus7s9Unu1TWX+oc/PyE5a44e+cfNe2cm/ukOiApCbyWU6s0t3PEWN/PtdHm7QIAAABvReSPMPnzaMqP/32Wr62Ff77866f8bT7KoIzZqzTWu0Js0NJO/WiShP8V148wMVuLxtiL0jjPMFQB0/cokxB02WlrBGwc7gqnRmHgczqdUGyXFJdkX32uZFYuWgsBBW1HL2yjAqr3uY9qRdmTTEc7ziM71iV/b0rP6tSHZ7Ez65gBUC8AZMh2XpXKmuJZlDXw5wrLWYwnVTQ8I6StX3aoUcum/z4MEw7b93O7z878zzBzhQEAAIC3aALlsiK/KjcS9iEmar6hsVIR02xItvXeuGWsvbjPMFTUCqX1gu/nfcsyw2o9BiNIoFCrtRpifXb1KBBvd1oh2BuK3ecuqvmZ26qM05TLZ7vPw9rR++mFbZ7lquH97DUpGpvi1Kc+5A+luGXq0V+1hboOhlyI2TGaqcpP940VuTe9bbWfdarKkX1zJ5POdvbxnHmK9kI+R9a2WX2szl8qBPlcQfj7o/oQ982VtOdtWuf5vv3Zfcdp9cuZxzvoc6nrExd13dp97ry3grQDAAAAvDWxb9++RTc8FND379/H2r+9Yufkv4hHdaxp9vk94zwDAAAAsxV5uSwAAAAA4P0iZAIAAAAAIvMmy2UBAAAAAPOJkUwAAAAAQGQImQAAAACAyLzJkKlWEK3UK+4zNd/CsabZZ8wPrjsAAADem/8PYSxkxJOC+DUAAAAASUVORK5CYII="
    }
   },
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用户数据文件为u.user，文件格式为u.user\n",
    "![ml-100k-user.jpg](attachment:ml-100k-user.jpg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  age sex  occupation zip_code\n",
       "0        1   24   M  technician    85711\n",
       "1        2   53   F       other    94043\n",
       "2        3   23   M      writer    32067\n",
       "3        4   24   M  technician    43537\n",
       "4        5   33   F       other    15213"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读入数据\n",
    "u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']\n",
    "users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')\n",
    "\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 943 entries, 0 to 942\n",
      "Data columns (total 5 columns):\n",
      "user_id       943 non-null int64\n",
      "age           943 non-null int64\n",
      "sex           943 non-null object\n",
      "occupation    943 non-null object\n",
      "zip_code      943 non-null object\n",
      "dtypes: int64(2), object(3)\n",
      "memory usage: 36.9+ KB\n"
     ]
    }
   ],
   "source": [
    "users.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FE = FeatureEng()\n",
    "\n",
    "#n_users = users.shape[0]  #直接在utils文件中设置\n",
    "# 'age', 'sexID', 'occupationID', 'zip_codeID'\n",
    "\n",
    "n_cols = users.shape[1] - 1\n",
    "cols = ['age', 'sexID', 'occupationID', 'zip_codeID']\n",
    "\n",
    "#users编码后的特征\n",
    "#userMatrix = np.zeros((n_users, n_cols), dtype=np.int)\n",
    "userMatrix = ss.dok_matrix((n_Users, n_cols))\n",
    "\n",
    "for u in range(users.shape[0]): \n",
    "    userId = users.loc[u,'user_id']-1  #索引从0开始\n",
    "    \n",
    "    userMatrix[userId, 0] =users.loc[u,'age']\n",
    "    userMatrix[userId, 1] = FE.getSexId(users.loc[u,'sex'])\n",
    "    userMatrix[userId, 2] = FE.getoccupationId(users.loc[u,'occupation'])\n",
    "    userMatrix[userId, 3] = FE.getzip_codeInt(users.loc[u,'zip_code'])\n",
    "\n",
    "#df_FE = pd.DataFrame(data=userMatrix.todense(), columns=cols)  \n",
    "#df_FE.to_csv(\"EV_userMatrix.csv\")\n",
    "# 归一化用户矩阵\n",
    "userMatrix = normalize(userMatrix, norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"US_userMatrix\", userMatrix)\n",
    "\n",
    "# 计算用户相似度矩阵，之后用户推荐系统\n",
    "userSimMatrix = ss.dok_matrix((n_Users, n_Users))\n",
    "\n",
    "#读取在测试集和训练集中出现的事件对\n",
    "uniqueUserPairs = cPickle.load(open(\"FE_uniqueUserPairs.pkl\", 'rb'))\n",
    "\n",
    "#对角线元素\n",
    "for i in range(0, n_Users):\n",
    "    userSimMatrix[i, i] = 1.0\n",
    "    \n",
    "#对称\n",
    "for u1, u2 in uniqueUserPairs:\n",
    "    i = u1; #int(u1)-1\n",
    "    j = u2; #int(u2)-1\n",
    "    if not userSimMatrix.has_key((i, j)):\n",
    "        usim = ssd.correlation(userMatrix.getrow(i).todense(),\n",
    "          userMatrix.getrow(j).todense())\n",
    "        userSimMatrix[i, j] = usim\n",
    "        userSimMatrix[j, i] = usim\n",
    "    \n",
    "sio.mmwrite(\"US_userSimMatrix\", userSimMatrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sexID</th>\n",
       "      <th>occupationID</th>\n",
       "      <th>zip_codeID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>24.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>85711.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>53.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>94043.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>32067.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>43537.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>15213.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    age  sexID  occupationID  zip_codeID\n",
       "0  24.0    1.0          20.0     85711.0\n",
       "1  53.0    2.0          14.0     94043.0\n",
       "2  23.0    1.0          21.0     32067.0\n",
       "3  24.0    1.0          20.0     43537.0\n",
       "4  33.0    2.0          14.0     15213.0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_FE.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
